library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
library("biomaRt")
Download miRNA expression data from The Cancer Genome Atlas (TCGA): -
TCGA-COAD
refers to the biospecimen data for colon
adenocarcinoma.
query_tumor <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification",
experimental.strategy = "miRNA-Seq",
access = "open",
sample.type = "Primary Tumor"
)
tumor <- getResults(query_tumor)
tumor
query_normal <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification",
experimental.strategy = "miRNA-Seq",
access = "open",
sample.type = "Solid Tissue Normal"
)
normal <- getResults(query_normal)
normal
Consider only samples with both normal and malignant tissues.
submitter_ids <- inner_join(tumor, normal, by = "cases.submitter_id") %>%
dplyr::select(cases.submitter_id)
tumor <- tumor %>%
dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
normal <- normal %>%
dplyr::filter(cases.submitter_id %in% submitter_ids$cases.submitter_id)
samples <- rbind(tumor, normal)
invisible(unique(samples$sample_type))
samples
Download only samples with both normal and malignant tissues.
To impose this filtering, we set the barcode
argument of
GDCquery
to
samples$sample.submitter_id
(which was generated in the
previous cell).
query_coad <- GDCquery(
project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "miRNA Expression Quantification",
experimental.strategy = "miRNA-Seq",
access = "open",
sample.type = c("Solid Tissue Normal", "Primary Tumor"),
barcode = as.list(samples$sample.submitter_id)
)
If this is your first time running this notebook (i.e., you have not yet downloaded the results of the query in the previous block), uncomment the code block below.
# GDCdownload(query_coad)
Running the code block above should generate and populate a directory
named GDCdata
.
Construct the RNA-seq count matrix.
tcga_coad_data <- GDCprepare(query_coad, summarizedExperiment = TRUE)
rownames(tcga_coad_data) <- tcga_coad_data$miRNA_ID
count_matrix <- tcga_coad_data[, colnames(tcga_coad_data)[grep("count", colnames(tcga_coad_data))]]
colnames(count_matrix) <- gsub("read_count_", "", colnames(count_matrix))
# Remove duplicate entries
count_matrix_df <- data.frame(count_matrix)
count_matrix_df <- count_matrix_df[!duplicated(count_matrix_df), ]
count_matrix <- data.matrix(count_matrix_df)
rownames(count_matrix) <- cleanid(rownames(count_matrix))
count_matrix <- count_matrix[!(duplicated(rownames(count_matrix)) | duplicated(rownames(count_matrix), fromLast = TRUE)), ]
head(count_matrix[1:5, 1:4])
TCGA.A6.2671.01A.01T.1409.13 TCGA.A6.2683.01A.01T.0822.13 TCGA.A6.2680.01A.01T.1409.13 TCGA.AA.3520.01A.01T.0822.13
hsa-let-7a-1 11763 50288 6055 4503
hsa-let-7a-2 11992 50537 5945 4589
hsa-let-7a-3 11822 51098 6018 4619
hsa-let-7b 15937 143822 9392 19505
hsa-let-7c 1488 3943 141 751
Format the samples
table so that it can be fed as input
to DESeq2.
rownames(samples) <- samples$cases
samples <- samples %>%
dplyr::select(case = "cases.submitter_id", type = "sample_type")
samples$type <- str_replace(samples$type, "Solid Tissue Normal", "normal")
samples$type <- str_replace(samples$type, "Primary Tumor", "tumor")
DESeq2 requires the row names of samples
should be
identical to the column names of count_matrix
.
colnames(count_matrix) <- gsub(x = colnames(count_matrix), pattern = "\\.", replacement = "-")
count_matrix <- count_matrix[, rownames(samples)]
# Sanity check
all(colnames(count_matrix) == rownames(samples))
References:
Construct the DESeqDataSet
object.
dds <- DESeqDataSetFromMatrix(
countData = count_matrix,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
Display quality control (QC) plots (refer to https://cran.r-project.org/web/packages/RNAseqQC/vignettes/introduction.html)
CAVEAT: There seem to be some outliers — but we defer handling them for now!
plot_total_counts(dds)
plot_library_complexity(dds)
plot_gene_detection(dds)
Perform miRNA filtering.
We determined min_count
empirically by looking at the
red trend line in the variance stabilization plot. Ideally, this
trend line should be flat (i.e., stable).
dds <- filter_genes(dds, min_count = 10)
Transform the read counts.
From
https://chipster.csc.fi/manual/deseq2-transform.html:
You can use the resulting transformed values only for visualization
and clustering, not for differential expression analysis which needs
raw counts.
dds <- estimateSizeFactors(dds)
nsub <- sum(rowMeans(counts(dds, normalized = TRUE)) > 10)
vsd <- vst(dds, nsub = nsub)
mean_sd_plot(vsd)
Check the clustering of the samples.
If you encounter the error
Error in loadNamespace(x) : there is no package called
'ComplexHeatmap'
, uncomment and run the following code block:
# install.packages("devtools", dependencies = TRUE)
# devtools::install_github("jokergoo/ComplexHeatmap")
set.seed(42)
plot_sample_clustering(vsd, anno_vars = c("type"), distance = "euclidean")
Perform principal component analysis (PCA).
plot_pca(vsd, PC_x = 1, PC_y = 2, shape_by = "type")
Refer to
1. Exploratory Data Analysis - MSigDB Gene Sets + GTEx
TPM.Rmd
for more detailed documentation on obtaining the gene sets.
We are going to refer to miRDB to for the miRNA-mRNA mapping: https://academic.oup.com/nar/article/48/D1/D127/5557729
Download and uncompress miRDB v6.0 by running:
wget https://mirdb.org/download/miRDB_v6.0_prediction_result.txt.gz -P data/
gzip -d miRDB_v6.0_prediction_result.txt.gz
The first column of the dataset lists the miRNAs, while the second
column lists the mRNAs (more specifically, the RefSeq IDs of the
mRNAs). However, our gene sets list the mRNAs of interest (i.e.,
those involved in regulated cell death) using their gene symbols.
Hence, we need to perform some preprocessing to convert the gene
symbols to RefSeq IDs.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
Alternative miRNA-mRNA mapping tools/databases:
Fetch the necroptosis gene set.
necroptosis.genes <- msigdbr(species = "human", category = "C5", subcategory = "GO:BP") %>%
dplyr::filter(gs_name == "GOBP_NECROPTOTIC_SIGNALING_PATHWAY")
necroptosis.genes
Get the gene symbols of the genes included in the necroptosis gene set, and convert them to RefSeq IDs.
rcd_gene_symbols <- necroptosis.genes$gene_symbol
rcd_refseq <- getBM(attributes = c("refseq_mrna", "hgnc_symbol"), filters = "hgnc_symbol", values = rcd_gene_symbols, mart = mart)
rcd_refseq
Write the RefSeq IDs of the mRNAs of interest to a file.
rcd_mrna_file <- "temp/necroptosis-genes-refseq.txt"
rcd_refseq.unique_ids <- unique(unlist(rcd_refseq$refseq_mrna))
rcd_refseq.unique_ids <- rcd_refseq.unique_ids[!sapply(rcd_refseq.unique_ids, identical, "")]
# Regenerate the file every time
if (file.exists(rcd_mrna_file)) {
file.remove(rcd_mrna_file)
}
invisible(lapply(rcd_refseq.unique_ids, write, rcd_mrna_file, append = TRUE, ncolumns = 1))
Run the following Python script to fetch the miRNAs targeting the mRNAs of interest:
python util/get-mirna.py --mrna-list temp/necroptosis-genes-refseq.txt --output temp/necroptosis-mirna.txt
Running this script should generate a file named
necroptosis-mirna.txt
inside the
temp
directory.
Fetch the miRNAs from the said file.
necroptosis.mirna <- read.table("temp/necroptosis-mirna.txt")
necroptosis.mirna <- unique(unlist(necroptosis.mirna$V1))
Filter the genes to include only those in the necroptosis gene set.
coad_necroptosis <- count_matrix[rownames(count_matrix) %in% necroptosis.mirna, ]
coad_necroptosis <- coad_necroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_necroptosis) == rownames(samples))
Perform differential miRNA expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_necroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 5 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 82 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 40, 49%
LFC < 0 (down) : 36, 44%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 5)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon"
)
deseq.bbl.data
Filter based on p-value and log fold change cutoffs.
deseq.bbl.data.filtered <- dplyr::filter(deseq.bbl.data, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.filtered
Plot the results.
ggplot(deseq.bbl.data.filtered, aes(x = cancer_type, y = rownames(deseq.bbl.data.filtered), size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.filtered$log2FoldChange),max(deseq.bbl.data.filtered$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "Adjusted p-value", fill = "log2 FC", x = "Cancer type", y = "miRNA")
Fetch the ferroptosis gene set.
ferroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:WIKIPATHWAYS") %>%
dplyr::filter(gs_name == "WP_FERROPTOSIS")
ferroptosis.genes
Get the gene symbols of the genes included in the ferroptosis gene set, and convert them to RefSeq IDs.
rcd_gene_symbols <- ferroptosis.genes$gene_symbol
rcd_refseq <- getBM(attributes = c("refseq_mrna", "hgnc_symbol"), filters = "hgnc_symbol", values = rcd_gene_symbols, mart = mart)
rcd_refseq
Write the RefSeq IDs of the mRNAs of interest to a file.
rcd_mrna_file <- "temp/ferroptosis-genes-refseq.txt"
rcd_refseq.unique_ids <- unique(unlist(rcd_refseq$refseq_mrna))
rcd_refseq.unique_ids <- rcd_refseq.unique_ids[!sapply(rcd_refseq.unique_ids, identical, "")]
# Regenerate the file every time
if (file.exists(rcd_mrna_file)) {
file.remove(rcd_mrna_file)
}
invisible(lapply(rcd_refseq.unique_ids, write, rcd_mrna_file, append = TRUE, ncolumns = 1))
Run the following Python script to fetch the miRNAs targeting the mRNAs of interest:
python util/get-mirna.py --mrna-list temp/ferroptosis-genes-refseq.txt --output temp/ferroptosis-mirna.txt
Running this script should generate a file named
ferroptosis-mirna.txt
inside the
temp
directory.
Fetch the miRNAs from the said file.
ferroptosis.mirna <- read.table("temp/ferroptosis-mirna.txt")
ferroptosis.mirna <- unique(unlist(ferroptosis.mirna$V1))
Filter the genes to include only those in the ferroptosis gene set.
coad_ferroptosis <- count_matrix[rownames(count_matrix) %in% ferroptosis.mirna, ]
coad_ferroptosis <- coad_ferroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_ferroptosis) == rownames(samples))
Perform differential miRNA expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_ferroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 13 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 256 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 114, 45%
LFC < 0 (down) : 100, 39%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon"
)
deseq.bbl.data
Filter based on p-value and log fold change cutoffs.
deseq.bbl.data.filtered <- dplyr::filter(deseq.bbl.data, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.filtered
Plot the results.
ggplot(deseq.bbl.data.filtered, aes(x = cancer_type, y = rownames(deseq.bbl.data.filtered), size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.filtered$log2FoldChange),max(deseq.bbl.data.filtered$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "Adjusted p-value", fill = "log2 FC", x = "Cancer type", y = "miRNA")
Fetch the pyroptosis gene set.
pyroptosis.genes <- msigdbr(species = "human", category = "C2", subcategory = "CP:REACTOME") %>%
dplyr::filter(gs_name == "REACTOME_PYROPTOSIS")
pyroptosis.genes
Get the gene symbols of the genes included in the pyroptosis gene set, and convert them to RefSeq IDs.
rcd_gene_symbols <- pyroptosis.genes$gene_symbol
rcd_refseq <- getBM(attributes = c("refseq_mrna", "hgnc_symbol"), filters = "hgnc_symbol", values = rcd_gene_symbols, mart = mart)
rcd_refseq
Write the RefSeq IDs of the mRNAs of interest to a file.
rcd_mrna_file <- "temp/pyroptosis-genes-refseq.txt"
rcd_refseq.unique_ids <- unique(unlist(rcd_refseq$refseq_mrna))
rcd_refseq.unique_ids <- rcd_refseq.unique_ids[!sapply(rcd_refseq.unique_ids, identical, "")]
# Regenerate the file every time
if (file.exists(rcd_mrna_file)) {
file.remove(rcd_mrna_file)
}
invisible(lapply(rcd_refseq.unique_ids, write, rcd_mrna_file, append = TRUE, ncolumns = 1))
Run the following Python script to fetch the miRNAs targeting the mRNAs of interest:
python util/get-mirna.py --mrna-list temp/pyroptosis-genes-refseq.txt --output temp/pyroptosis-mirna.txt
Running this script should generate a file named
pyroptosis-mirna.txt
inside the
temp
directory.
Fetch the miRNAs from the said file.
pyroptosis.mirna <- read.table("temp/pyroptosis-mirna.txt")
pyroptosis.mirna <- unique(unlist(pyroptosis.mirna$V1))
Filter the genes to include only those in the pyroptosis gene set.
coad_pyroptosis <- count_matrix[rownames(count_matrix) %in% pyroptosis.mirna, ]
coad_pyroptosis <- coad_pyroptosis[, rownames(samples)]
# Check if all samples in the counts dataframe are in the samples dataframe
all(colnames(coad_pyroptosis) == rownames(samples))
Perform differential miRNA expression analysis.
dds <- DESeqDataSetFromMatrix(
countData = coad_pyroptosis,
colData = samples,
design = ~type
)
Warning: some variables in design formula are characters, converting to factors
dds <- filter_genes(dds, min_count = 10)
dds$type <- relevel(dds$type, ref = "normal")
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 9 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
res <- results(dds)
summary(res)
out of 184 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 84, 46%
LFC < 0 (down) : 71, 39%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Prettify the display of results.
deseq.results <- res
deseq.bbl.data <- data.frame(
row.names = rownames(deseq.results),
baseMean = deseq.results$baseMean,
log2FoldChange = deseq.results$log2FoldChange,
lfcSE = deseq.results$lfcSE,
stat = deseq.results$stat,
pvalue = deseq.results$pvalue,
padj = deseq.results$padj,
cancer_type = "Colon"
)
deseq.bbl.data
Filter based on p-value and log fold change cutoffs.
deseq.bbl.data.filtered <- dplyr::filter(deseq.bbl.data, abs(log2FoldChange) >= 1.5 & padj < 0.05)
deseq.bbl.data.filtered
Plot the results.
ggplot(deseq.bbl.data.filtered, aes(x = cancer_type, y = rownames(deseq.bbl.data.filtered), size = padj, fill = log2FoldChange)) +
geom_point(alpha = 0.5, shape = 21, color = "black") +
scale_size(trans = "reverse") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(min(deseq.bbl.data.filtered$log2FoldChange),max(deseq.bbl.data.filtered$log2FoldChange))) +
theme_minimal() +
theme(legend.position = "bottom") +
theme(legend.position = "bottom") +
labs(size = "Adjusted p-value", fill = "log2 FC", x = "Cancer type", y = "miRNA")
De La Salle University, Manila, Philippines, gonzales.markedward@gmail.com↩︎
De La Salle University, Manila, Philippines, anish.shrestha@dlsu.edu.ph↩︎